[AArch64] Optimize extending loads of small vectors #163064
Conversation
@llvm/pr-subscribers-backend-aarch64
Author: Guy David (guy-david)
Changes: Reduces the total number of loads and the number of moves between SIMD registers and general-purpose registers. (A short sketch of the byte-layout intuition behind the transform follows the AArch64ISelLowering.cpp diff below.)
Patch is 20.56 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/163064.diff
6 Files Affected:
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 69651168f8539..274c0dd6f42cc 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -23207,6 +23207,99 @@ static SDValue performZExtUZPCombine(SDNode *N, SelectionDAG &DAG) {
return DAG.getNode(ISD::AND, DL, VT, BC, DAG.getConstant(Mask, DL, VT));
}
+// Helper function to optimize small vector load + extension patterns.
+// These patterns would otherwise be scalarized into inefficient sequences.
+static SDValue performSmallVectorLoadExtCombine(SDNode *N, SelectionDAG &DAG) {
+ // Don't optimize if NEON is not available. Without NEON, the backend
+ // will need to scalarize these operations anyway.
+ const AArch64Subtarget &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
+ if (!Subtarget.isNeonAvailable())
+ return SDValue();
+ // Don't optimize if SVE is being used for fixed-length vectors, because it
+ // has native support for these patterns.
+ if (Subtarget.useSVEForFixedLengthVectors())
+ return SDValue();
+
+ unsigned Opcode = N->getOpcode();
+ if (Opcode != ISD::ZERO_EXTEND && Opcode != ISD::SIGN_EXTEND &&
+ Opcode != ISD::ANY_EXTEND)
+ return SDValue();
+
+ SDValue Op = N->getOperand(0);
+ if (Op.getOpcode() != ISD::LOAD)
+ return SDValue();
+ LoadSDNode *LD = cast<LoadSDNode>(Op);
+ if (LD->getExtensionType() != ISD::NON_EXTLOAD || !LD->hasOneUse() ||
+ LD->isVolatile())
+ return SDValue();
+
+ EVT MemVT = LD->getMemoryVT();
+ EVT ResVT = N->getValueType(0);
+ // Check if this is a small vector pattern we want to optimize.
+ if (MemVT != MVT::v2i8 && MemVT != MVT::v2i16)
+ return SDValue();
+
+ unsigned NumElts = MemVT.getVectorNumElements();
+ unsigned SrcEltBits = MemVT.getScalarSizeInBits();
+ unsigned DstEltBits = ResVT.getScalarSizeInBits();
+ unsigned LoadBits = NumElts * SrcEltBits;
+
+ // Check alignment: the optimization loads a larger scalar, which may be
+ // unaligned, compared to what the original load will be legalized into.
+ Align Alignment = LD->getAlign();
+ if (Subtarget.requiresStrictAlign() && Alignment < LoadBits)
+ return SDValue();
+
+ // The transformation strategy:
+ // 1. Load the memory as a large scalar and turn it into a 64-bit vector.
+ // 2. Bitcast to a narrow type (v8i8 or v4i16) that has efficient NEON extend.
+ // 3. Extend using ushll/sshll, extract subvector, repeat as needed.
+
+ // For ANY_EXTEND, we can choose either sign or zero extend - zero is
+ // typically cheaper.
+ if (Opcode == ISD::ANY_EXTEND)
+ Opcode = ISD::ZERO_EXTEND;
+
+ SDLoc DL(N);
+ SDValue Chain = LD->getChain();
+ SDValue BasePtr = LD->getBasePtr();
+ const MachinePointerInfo &PtrInfo = LD->getPointerInfo();
+ MVT LoadTy = MVT::getIntegerVT(LoadBits);
+ SDValue Load = DAG.getLoad(LoadTy, DL, Chain, BasePtr, PtrInfo, Alignment);
+
+ // SCALAR_TO_VECTOR needs to create a 64-bit vector for NEON instructions.
+ // The scalar load is inserted into the lower bits of a 64-bit register.
+ // We determine the appropriate 64-bit vector type based on load size,
+ // then bitcast to v8i8 or v4i16 for efficient ushll/sshll extends.
+ MVT ScalarVecVT = MVT::getVectorVT(LoadTy, 64 / LoadBits);
+ MVT NarrowVT = MVT::getVectorVT(MemVT.getVectorElementType().getSimpleVT(),
+ 64 / MemVT.getScalarSizeInBits());
+
+ SDValue Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ScalarVecVT, Load);
+ Vec = DAG.getNode(ISD::BITCAST, DL, NarrowVT, Vec);
+ // Extend iteratively: each extend doubles the element size.
+ // We extend the full 64-bit vector to leverage NEON ushll/sshll instructions.
+ while (Vec.getScalarValueSizeInBits() < DstEltBits) {
+ MVT CurVT = Vec.getSimpleValueType();
+ unsigned NextBits = CurVT.getScalarSizeInBits() * 2;
+ MVT WideVT = MVT::getVectorVT(MVT::getIntegerVT(NextBits),
+ CurVT.getVectorNumElements());
+ Vec = DAG.getNode(Opcode, DL, WideVT, Vec);
+
+ // Extract only when: excess elements + still wide + done extending.
+ bool HasExcess = WideVT.getVectorNumElements() > NumElts;
+ bool StaysWide = WideVT.getSizeInBits() >= 64;
+ bool IsDone = NextBits >= DstEltBits;
+ if (HasExcess && StaysWide && IsDone) {
+ MVT ExtractVT = MVT::getVectorVT(WideVT.getScalarType(), NumElts);
+ Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT, Vec,
+ DAG.getConstant(0, DL, MVT::i64));
+ }
+ }
+
+ return DAG.getMergeValues({Vec, Load.getValue(1)}, DL);
+}
+
static SDValue performExtendCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
SelectionDAG &DAG) {
@@ -23256,6 +23349,12 @@ static SDValue performExtendCombine(SDNode *N,
NewAnyExtend);
}
+ // Try to optimize small vector load + extension patterns
+ if (SDValue Result = performSmallVectorLoadExtCombine(N, DAG))
+ return Result;
+
return SDValue();
}
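
Why one scalar load is equivalent here: on a little-endian target, the two adjacent i8 (or i16) elements, loaded as a single 16-bit (or 32-bit) scalar, land in the low lanes of the vector register in their original order, so the ushll/sshll chain sees the same lane values the element-wise loads would have produced. On big-endian the bytes arrive reversed, which is why the CHECK-BE output in the tests below inserts rev16/rev32 before the extends. A minimal standalone sketch of the little-endian equivalence (illustration only, not part of the patch; the array contents are made up):

```cpp
// Illustration only (not LLVM code): loading <2 x i8> as one 16-bit scalar
// preserves element order in the low lanes on a little-endian host.
#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  const uint8_t mem[2] = {0x12, 0x34}; // the in-memory <2 x i8> value

  // What the combine emits: a single 16-bit scalar load ("ldr h0" in the tests)...
  uint16_t scalar;
  std::memcpy(&scalar, mem, sizeof(scalar));

  // ...whose low and high bytes become lanes 0 and 1 before the ushll widening.
  const unsigned lane0 = scalar & 0xFF;
  const unsigned lane1 = (scalar >> 8) & 0xFF;

  // The original pattern: two element-wise byte loads, each zero-extended.
  // Both forms agree on a little-endian host; big-endian needs a byte swap
  // first, matching the rev16 in the CHECK-BE lines.
  assert(lane0 == mem[0] && lane1 == mem[1]);
  return 0;
}
```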
diff --git a/llvm/test/CodeGen/AArch64/aarch64-load-ext.ll b/llvm/test/CodeGen/AArch64/aarch64-load-ext.ll
index 317feb5ad9ad0..bc0edc9b5eca6 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-load-ext.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-load-ext.ll
@@ -22,17 +22,16 @@ define <2 x i16> @test0(ptr %i16_ptr, i64 %inc) {
define <2 x i16> @test1(ptr %v2i16_ptr) {
; CHECK-LE-LABEL: test1:
; CHECK-LE: // %bb.0:
-; CHECK-LE-NEXT: ld1 { v0.h }[0], [x0]
-; CHECK-LE-NEXT: add x8, x0, #2
-; CHECK-LE-NEXT: ld1 { v0.h }[2], [x8]
+; CHECK-LE-NEXT: ldr s0, [x0]
+; CHECK-LE-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-LE-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-LE-NEXT: ret
;
; CHECK-BE-LABEL: test1:
; CHECK-BE: // %bb.0:
-; CHECK-BE-NEXT: ld1 { v0.h }[0], [x0]
-; CHECK-BE-NEXT: add x8, x0, #2
-; CHECK-BE-NEXT: ld1 { v0.h }[2], [x8]
+; CHECK-BE-NEXT: ldr s0, [x0]
+; CHECK-BE-NEXT: rev32 v0.4h, v0.4h
+; CHECK-BE-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-BE-NEXT: rev64 v0.2s, v0.2s
; CHECK-BE-NEXT: ret
%v2i16 = load <2 x i16>, ptr %v2i16_ptr
@@ -66,17 +65,18 @@ define <2 x i16> @test2(ptr %i16_ptr, i64 %inc) {
define <2 x i8> @test3(ptr %v2i8_ptr) {
; CHECK-LE-LABEL: test3:
; CHECK-LE: // %bb.0:
-; CHECK-LE-NEXT: ld1 { v0.b }[0], [x0]
-; CHECK-LE-NEXT: add x8, x0, #1
-; CHECK-LE-NEXT: ld1 { v0.b }[4], [x8]
+; CHECK-LE-NEXT: ldr h0, [x0]
+; CHECK-LE-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-LE-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-LE-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-LE-NEXT: ret
;
; CHECK-BE-LABEL: test3:
; CHECK-BE: // %bb.0:
-; CHECK-BE-NEXT: ld1 { v0.b }[0], [x0]
-; CHECK-BE-NEXT: add x8, x0, #1
-; CHECK-BE-NEXT: ld1 { v0.b }[4], [x8]
+; CHECK-BE-NEXT: ldr h0, [x0]
+; CHECK-BE-NEXT: rev16 v0.8b, v0.8b
+; CHECK-BE-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-BE-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-BE-NEXT: rev64 v0.2s, v0.2s
; CHECK-BE-NEXT: ret
%v2i8 = load <2 x i8>, ptr %v2i8_ptr
@@ -105,19 +105,18 @@ define <4 x i8> @test4(ptr %v4i8_ptr) {
define <2 x i32> @fsext_v2i32(ptr %a) {
; CHECK-LE-LABEL: fsext_v2i32:
; CHECK-LE: // %bb.0:
-; CHECK-LE-NEXT: ldrsb w8, [x0]
-; CHECK-LE-NEXT: ldrsb w9, [x0, #1]
-; CHECK-LE-NEXT: fmov s0, w8
-; CHECK-LE-NEXT: mov v0.s[1], w9
+; CHECK-LE-NEXT: ldr h0, [x0]
+; CHECK-LE-NEXT: sshll v0.8h, v0.8b, #0
+; CHECK-LE-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-LE-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-LE-NEXT: ret
;
; CHECK-BE-LABEL: fsext_v2i32:
; CHECK-BE: // %bb.0:
-; CHECK-BE-NEXT: ldrsb w8, [x0]
-; CHECK-BE-NEXT: ldrsb w9, [x0, #1]
-; CHECK-BE-NEXT: fmov s0, w8
-; CHECK-BE-NEXT: mov v0.s[1], w9
+; CHECK-BE-NEXT: ldr h0, [x0]
+; CHECK-BE-NEXT: rev16 v0.8b, v0.8b
+; CHECK-BE-NEXT: sshll v0.8h, v0.8b, #0
+; CHECK-BE-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-BE-NEXT: rev64 v0.2s, v0.2s
; CHECK-BE-NEXT: ret
%x = load <2 x i8>, ptr %a
@@ -249,19 +248,18 @@ define i32 @loadExti32(ptr %ref) {
define <2 x i16> @fsext_v2i16(ptr %a) {
; CHECK-LE-LABEL: fsext_v2i16:
; CHECK-LE: // %bb.0:
-; CHECK-LE-NEXT: ldrsb w8, [x0]
-; CHECK-LE-NEXT: ldrsb w9, [x0, #1]
-; CHECK-LE-NEXT: fmov s0, w8
-; CHECK-LE-NEXT: mov v0.s[1], w9
+; CHECK-LE-NEXT: ldr h0, [x0]
+; CHECK-LE-NEXT: sshll v0.8h, v0.8b, #0
+; CHECK-LE-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-LE-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-LE-NEXT: ret
;
; CHECK-BE-LABEL: fsext_v2i16:
; CHECK-BE: // %bb.0:
-; CHECK-BE-NEXT: ldrsb w8, [x0]
-; CHECK-BE-NEXT: ldrsb w9, [x0, #1]
-; CHECK-BE-NEXT: fmov s0, w8
-; CHECK-BE-NEXT: mov v0.s[1], w9
+; CHECK-BE-NEXT: ldr h0, [x0]
+; CHECK-BE-NEXT: rev16 v0.8b, v0.8b
+; CHECK-BE-NEXT: sshll v0.8h, v0.8b, #0
+; CHECK-BE-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-BE-NEXT: rev64 v0.2s, v0.2s
; CHECK-BE-NEXT: ret
%x = load <2 x i8>, ptr %a
@@ -497,3 +495,219 @@ define <4 x i8> @strict_align_unaligned(ptr %v4i8_ptr) "target-features"="+stric
%v4i8 = load <4 x i8>, ptr %v4i8_ptr, align 1
ret <4 x i8> %v4i8
}
+
+define <2 x i16> @zext_v2i8_v2i16(ptr %a) {
+; CHECK-LE-LABEL: zext_v2i8_v2i16:
+; CHECK-LE: // %bb.0:
+; CHECK-LE-NEXT: ldr h0, [x0]
+; CHECK-LE-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-LE-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-LE-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-LE-NEXT: ret
+;
+; CHECK-BE-LABEL: zext_v2i8_v2i16:
+; CHECK-BE: // %bb.0:
+; CHECK-BE-NEXT: ldr h0, [x0]
+; CHECK-BE-NEXT: rev16 v0.8b, v0.8b
+; CHECK-BE-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-BE-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-BE-NEXT: rev64 v0.2s, v0.2s
+; CHECK-BE-NEXT: ret
+ %x = load <2 x i8>, ptr %a
+ %y = zext <2 x i8> %x to <2 x i16>
+ ret <2 x i16> %y
+}
+
+define <2 x i32> @zext_v2i8_v2i32(ptr %a) {
+; CHECK-LE-LABEL: zext_v2i8_v2i32:
+; CHECK-LE: // %bb.0:
+; CHECK-LE-NEXT: ldr h0, [x0]
+; CHECK-LE-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-LE-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-LE-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-LE-NEXT: ret
+;
+; CHECK-BE-LABEL: zext_v2i8_v2i32:
+; CHECK-BE: // %bb.0:
+; CHECK-BE-NEXT: ldr h0, [x0]
+; CHECK-BE-NEXT: rev16 v0.8b, v0.8b
+; CHECK-BE-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-BE-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-BE-NEXT: rev64 v0.2s, v0.2s
+; CHECK-BE-NEXT: ret
+ %x = load <2 x i8>, ptr %a
+ %y = zext <2 x i8> %x to <2 x i32>
+ ret <2 x i32> %y
+}
+
+define <2 x i64> @zext_v2i8_v2i64(ptr %a) {
+; CHECK-LE-LABEL: zext_v2i8_v2i64:
+; CHECK-LE: // %bb.0:
+; CHECK-LE-NEXT: ldr h0, [x0]
+; CHECK-LE-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-LE-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-LE-NEXT: ushll v0.2d, v0.2s, #0
+; CHECK-LE-NEXT: ret
+;
+; CHECK-BE-LABEL: zext_v2i8_v2i64:
+; CHECK-BE: // %bb.0:
+; CHECK-BE-NEXT: ldr h0, [x0]
+; CHECK-BE-NEXT: rev16 v0.8b, v0.8b
+; CHECK-BE-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-BE-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-BE-NEXT: ushll v0.2d, v0.2s, #0
+; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
+; CHECK-BE-NEXT: ret
+ %x = load <2 x i8>, ptr %a
+ %y = zext <2 x i8> %x to <2 x i64>
+ ret <2 x i64> %y
+}
+
+define <2 x i32> @zext_v2i16_v2i32(ptr %a) {
+; CHECK-LE-LABEL: zext_v2i16_v2i32:
+; CHECK-LE: // %bb.0:
+; CHECK-LE-NEXT: ldr s0, [x0]
+; CHECK-LE-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-LE-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-LE-NEXT: ret
+;
+; CHECK-BE-LABEL: zext_v2i16_v2i32:
+; CHECK-BE: // %bb.0:
+; CHECK-BE-NEXT: ldr s0, [x0]
+; CHECK-BE-NEXT: rev32 v0.4h, v0.4h
+; CHECK-BE-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-BE-NEXT: rev64 v0.2s, v0.2s
+; CHECK-BE-NEXT: ret
+ %x = load <2 x i16>, ptr %a
+ %y = zext <2 x i16> %x to <2 x i32>
+ ret <2 x i32> %y
+}
+
+define <2 x i64> @zext_v2i16_v2i64(ptr %a) {
+; CHECK-LE-LABEL: zext_v2i16_v2i64:
+; CHECK-LE: // %bb.0:
+; CHECK-LE-NEXT: ldr s0, [x0]
+; CHECK-LE-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-LE-NEXT: ushll v0.2d, v0.2s, #0
+; CHECK-LE-NEXT: ret
+;
+; CHECK-BE-LABEL: zext_v2i16_v2i64:
+; CHECK-BE: // %bb.0:
+; CHECK-BE-NEXT: ldr s0, [x0]
+; CHECK-BE-NEXT: rev32 v0.4h, v0.4h
+; CHECK-BE-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-BE-NEXT: ushll v0.2d, v0.2s, #0
+; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
+; CHECK-BE-NEXT: ret
+ %x = load <2 x i16>, ptr %a
+ %y = zext <2 x i16> %x to <2 x i64>
+ ret <2 x i64> %y
+}
+
+define <2 x i16> @sext_v2i8_v2i16(ptr %a) {
+; CHECK-LE-LABEL: sext_v2i8_v2i16:
+; CHECK-LE: // %bb.0:
+; CHECK-LE-NEXT: ldr h0, [x0]
+; CHECK-LE-NEXT: sshll v0.8h, v0.8b, #0
+; CHECK-LE-NEXT: sshll v0.4s, v0.4h, #0
+; CHECK-LE-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-LE-NEXT: ret
+;
+; CHECK-BE-LABEL: sext_v2i8_v2i16:
+; CHECK-BE: // %bb.0:
+; CHECK-BE-NEXT: ldr h0, [x0]
+; CHECK-BE-NEXT: rev16 v0.8b, v0.8b
+; CHECK-BE-NEXT: sshll v0.8h, v0.8b, #0
+; CHECK-BE-NEXT: sshll v0.4s, v0.4h, #0
+; CHECK-BE-NEXT: rev64 v0.2s, v0.2s
+; CHECK-BE-NEXT: ret
+ %x = load <2 x i8>, ptr %a
+ %y = sext <2 x i8> %x to <2 x i16>
+ ret <2 x i16> %y
+}
+
+define <2 x i32> @sext_v2i8_v2i32(ptr %a) {
+; CHECK-LE-LABEL: sext_v2i8_v2i32:
+; CHECK-LE: // %bb.0:
+; CHECK-LE-NEXT: ldr h0, [x0]
+; CHECK-LE-NEXT: sshll v0.8h, v0.8b, #0
+; CHECK-LE-NEXT: sshll v0.4s, v0.4h, #0
+; CHECK-LE-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-LE-NEXT: ret
+;
+; CHECK-BE-LABEL: sext_v2i8_v2i32:
+; CHECK-BE: // %bb.0:
+; CHECK-BE-NEXT: ldr h0, [x0]
+; CHECK-BE-NEXT: rev16 v0.8b, v0.8b
+; CHECK-BE-NEXT: sshll v0.8h, v0.8b, #0
+; CHECK-BE-NEXT: sshll v0.4s, v0.4h, #0
+; CHECK-BE-NEXT: rev64 v0.2s, v0.2s
+; CHECK-BE-NEXT: ret
+ %x = load <2 x i8>, ptr %a
+ %y = sext <2 x i8> %x to <2 x i32>
+ ret <2 x i32> %y
+}
+
+define <2 x i64> @sext_v2i8_v2i64(ptr %a) {
+; CHECK-LE-LABEL: sext_v2i8_v2i64:
+; CHECK-LE: // %bb.0:
+; CHECK-LE-NEXT: ldr h0, [x0]
+; CHECK-LE-NEXT: sshll v0.8h, v0.8b, #0
+; CHECK-LE-NEXT: sshll v0.4s, v0.4h, #0
+; CHECK-LE-NEXT: sshll v0.2d, v0.2s, #0
+; CHECK-LE-NEXT: ret
+;
+; CHECK-BE-LABEL: sext_v2i8_v2i64:
+; CHECK-BE: // %bb.0:
+; CHECK-BE-NEXT: ldr h0, [x0]
+; CHECK-BE-NEXT: rev16 v0.8b, v0.8b
+; CHECK-BE-NEXT: sshll v0.8h, v0.8b, #0
+; CHECK-BE-NEXT: sshll v0.4s, v0.4h, #0
+; CHECK-BE-NEXT: sshll v0.2d, v0.2s, #0
+; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
+; CHECK-BE-NEXT: ret
+ %x = load <2 x i8>, ptr %a
+ %y = sext <2 x i8> %x to <2 x i64>
+ ret <2 x i64> %y
+}
+
+define <2 x i32> @sext_v2i16_v2i32(ptr %a) {
+; CHECK-LE-LABEL: sext_v2i16_v2i32:
+; CHECK-LE: // %bb.0:
+; CHECK-LE-NEXT: ldr s0, [x0]
+; CHECK-LE-NEXT: sshll v0.4s, v0.4h, #0
+; CHECK-LE-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-LE-NEXT: ret
+;
+; CHECK-BE-LABEL: sext_v2i16_v2i32:
+; CHECK-BE: // %bb.0:
+; CHECK-BE-NEXT: ldr s0, [x0]
+; CHECK-BE-NEXT: rev32 v0.4h, v0.4h
+; CHECK-BE-NEXT: sshll v0.4s, v0.4h, #0
+; CHECK-BE-NEXT: rev64 v0.2s, v0.2s
+; CHECK-BE-NEXT: ret
+ %x = load <2 x i16>, ptr %a
+ %y = sext <2 x i16> %x to <2 x i32>
+ ret <2 x i32> %y
+}
+
+define <2 x i64> @sext_v2i16_v2i64(ptr %a) {
+; CHECK-LE-LABEL: sext_v2i16_v2i64:
+; CHECK-LE: // %bb.0:
+; CHECK-LE-NEXT: ldr s0, [x0]
+; CHECK-LE-NEXT: sshll v0.4s, v0.4h, #0
+; CHECK-LE-NEXT: sshll v0.2d, v0.2s, #0
+; CHECK-LE-NEXT: ret
+;
+; CHECK-BE-LABEL: sext_v2i16_v2i64:
+; CHECK-BE: // %bb.0:
+; CHECK-BE-NEXT: ldr s0, [x0]
+; CHECK-BE-NEXT: rev32 v0.4h, v0.4h
+; CHECK-BE-NEXT: sshll v0.4s, v0.4h, #0
+; CHECK-BE-NEXT: sshll v0.2d, v0.2s, #0
+; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
+; CHECK-BE-NEXT: ret
+ %x = load <2 x i16>, ptr %a
+ %y = sext <2 x i16> %x to <2 x i64>
+ ret <2 x i64> %y
+}
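
To connect these tests with the widening loop in performSmallVectorLoadExtCombine, the following standalone trace mirrors the loop's decisions: the lane count stays fixed while the lane width doubles on each ushll/sshll, and a subvector extract keeps only the requested elements once the destination width is reached. This is an illustration with hard-coded cases, not LLVM code, and later DAG combines may still narrow the intermediate types that show up in the trace:

```cpp
// Illustration only: traces the extend/extract steps the combine's loop
// chooses for the (source, destination) pairs exercised by the tests above.
#include <cstdio>

static void trace(unsigned NumElts, unsigned SrcEltBits, unsigned DstEltBits) {
  unsigned Lanes = 64 / SrcEltBits; // working vector: v8i8 or v4i16
  unsigned LaneBits = SrcEltBits;
  std::printf("v%ui%u -> v%ui%u:", NumElts, SrcEltBits, NumElts, DstEltBits);
  while (LaneBits < DstEltBits) {
    LaneBits *= 2; // one ushll/sshll doubles the lane width
    std::printf(" extend to v%ui%u", Lanes, LaneBits);
    bool HasExcess = Lanes > NumElts;
    bool StaysWide = Lanes * LaneBits >= 64;
    bool IsDone = LaneBits >= DstEltBits;
    if (HasExcess && StaysWide && IsDone) {
      Lanes = NumElts; // keep only the lanes the result actually needs
      std::printf(", extract v%ui%u", Lanes, LaneBits);
    }
  }
  std::printf("\n");
}

int main() {
  trace(2, 8, 16);  // zext/sext_v2i8_v2i16
  trace(2, 8, 32);  // zext/sext_v2i8_v2i32
  trace(2, 8, 64);  // zext/sext_v2i8_v2i64
  trace(2, 16, 32); // zext/sext_v2i16_v2i32
  trace(2, 16, 64); // zext/sext_v2i16_v2i64
  return 0;
}
```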
diff --git a/llvm/test/CodeGen/AArch64/aarch64-smull.ll b/llvm/test/CodeGen/AArch64/aarch64-smull.ll
index 6e5c666bdbc75..2cd54d4113542 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-smull.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-smull.ll
@@ -222,23 +222,17 @@ define <4 x i32> @smull_zext_v4i16_v4i32(ptr %A, ptr %B) nounwind {
define <2 x i64> @smull_zext_v2i32_v2i64(ptr %A, ptr %B) nounwind {
; CHECK-NEON-LABEL: smull_zext_v2i32_v2i64:
; CHECK-NEON: // %bb.0:
-; CHECK-NEON-NEXT: ldrh w8, [x0]
-; CHECK-NEON-NEXT: ldrh w9, [x0, #2]
+; CHECK-NEON-NEXT: ldr s0, [x0]
; CHECK-NEON-NEXT: ldr d1, [x1]
-; CHECK-NEON-NEXT: fmov d0, x8
-; CHECK-NEON-NEXT: mov v0.d[1], x9
-; CHECK-NEON-NEXT: xtn v0.2s, v0.2d
+; CHECK-NEON-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-NEON-NEXT: smull v0.2d, v0.2s, v1.2s
; CHECK-NEON-NEXT: ret
;
; CHECK-SVE-LABEL: smull_zext_v2i32_v2i64:
; CHECK-SVE: // %bb.0:
-; CHECK-SVE-NEXT: ldrh w8, [x0]
-; CHECK-SVE-NEXT: ldrh w9, [x0, #2]
+; CHECK-SVE-NEXT: ldr s0, [x0]
; CHECK-SVE-NEXT: ldr d1, [x1]
-; CHECK-SVE-NEXT: fmov d0, x8
-; CHECK-SVE-NEXT: mov v0.d[1], x9
-; CHECK-SVE-NEXT: xtn v0.2s, v0.2d
+; CHECK-SVE-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SVE-NEXT: smull v0.2d, v0.2s, v1.2s
; CHECK-SVE-NEXT: ret
;
diff --git a/llvm/test/CodeGen/AArch64/arm64ec-exit-thunks.ll b/llvm/test/CodeGen/AArch64/arm64ec-exit-thunks.ll
index dc352244deeef..050741f321466 100644
--- a/llvm/test/CodeGen/AArch64/arm64ec-exit-thunks.ll
+++ b/llvm/test/CodeGen/AArch64/arm64ec-exit-thunks.ll
@@ -187,7 +187,7 @@ declare void @has_varargs(...) nounwind;
; CHECK-NEXT: .seh_endfunclet
; CHECK-NEXT: .seh_endproc
-declare void @has_sret(ptr sret([100 x i8])) nounwind;
+declare void @has_sret(ptr sret([100 x i8])) nounwind
; CHECK-LABEL: .def $iexit_thunk$cdecl$m100$v;
; CHECK: .section .wowthk$aa,"xr",discard,$iexit_thunk$cdecl$m100$v
; CHECK: // %bb.0:
@@ -478,7 +478,7 @@ declare <4 x i8> @small_vector(<4 x i8> %0) nounwind;
; CHECK-NEXT: stur s0, [x29, #-4]
; CHECK-NEXT: blr x16
; CHECK-NEXT: stur w8, [x29, #-8]
-; CHECK-NEXT: ldur s0, [x29, #-8]
+; CHECK-NEXT: ldur s0, [x29, #-8]
; CHECK-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: .seh_startepilogue
diff --git a/llvm/test/CodeGen/AArch64/extbinopload.ll b/llvm/test/CodeGen/AArch64/extbinopload.ll
index cabb0e7278e40..d646cfe9072b5 100644
--- a/llvm/test/CodeGen/AArch64/extbinopload.ll
+++ b/llvm/test/CodeGen/AArch64/extbinopload.ll
@@ -263,16 +263,16 @@ define <16 x i16> @load_v16i8(ptr %p) {
define <2 x i16> @std_v2i8_v2i16(ptr %p) {
; CHECK-LABEL: std_v2i8_v2i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldrb w8, [x0, #2]
-; CHECK-NEXT: ldrb w9, [x0, #3]
-; CHECK-NEXT: fmov s0, w8
-; CHECK-NEXT: ldrb w8, [x0]
-; CHECK-NEXT: fmov s1, w8
-; CHECK-NEXT: mov v0.s[1], w9
-; CHECK-NEXT: ldrb w9, [x0, #1]
-; CHECK-NEXT: mov v1.s[1], w9
-; CHECK-NEXT: shl v0.2s, v0.2s, #3
-; CHECK-NEXT: add v0.2s, v1.2s, v0.2s
+; CHECK-NEXT: ldr h0, [x0, #2]
+; CHECK-NEXT: ldr h1, [x0]
+; CHECK-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-NEXT: ushll v1.8h, v1.8b, #0
+; CHECK-NEXT: mov h2, v0.h[0]
+; CHECK-NEXT: mov h3, v1.h[0]
+; CHECK-NEXT: mov v2.h[2], v0.h[1]
+; CHECK-NEXT: mov v3.h[2], v1.h[1]
+; CHECK-NEXT: shl v0.2s, v2.2s, #3
+; CHECK-NEXT: add v0.2s, v3.2s, v0.2s
; CHECK-NEXT: ret
%l1 = load <2 x i8>, ptr %p
%q = getelementptr i8, ptr %p, i32 2
diff --git a/llvm/test/CodeGen/AArch64/load.ll b/llvm/test/CodeGen/AArch64/load.ll
index c4bb6e37d6eaf..b138fa4085427 100644
--- a/llvm/test/CodeGen/AArch64/load.ll
+++ b/llvm/test/CodeGen/AArch64/load.ll
@@ -230,9 +230,9 @@ define <2 x i64> @load_v2i64(ptr %ptr) {
define <2 x i8> @load_v2i8(ptr %ptr, <2 x i8> %b) {
; CHECK-SD-LABEL: load_v2i8:
; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: ld1 { v0.b }[0], [x0]
-; CHECK-SD-NEXT: add x8, x0, #1
-; CHECK-SD-NEXT: ld1 { v0.b }[4], [x8]
+; CHECK-SD-NEXT: ldr h0, [x0]
+; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-...
[truncated]
Force-pushed from 2946317 to 011f8fc.
Reduces the total amount of loads and the amount of moves between SIMD registers and general-purpose registers.
Force-pushed from 011f8fc to 50496ad.
4 x i8 is custom lowered. Could we do the same thing here, or does that not work as well?